Predicting New Construction in Philadelphia

MUSA 508 Final Project

Author

Laura Frances and Nissim Lebovits

Published

December 3, 2023

1 Summary

2 Introduction

Show the code
# Package setup ----
# Install (if missing) and attach everything this analysis needs.
required_packages <- c(
  "tidyverse", "sf", "acs", "tidycensus", "sfdep", "kableExtra", "conflicted",
  "gganimate", "tmap", "gifski", "transformr", "ggpubr", "randomForest",
  "janitor", "igraph"
)
install_and_load_packages(required_packages)

# Project-level visualization helpers
source("utils/viz_utils.R")

# Resolve namespace conflicts in favor of dplyr
select <- dplyr::select
filter <- dplyr::filter
lag <- dplyr::lag

# No scientific notation; cache tigris downloads and return them as sf
options(scipen = 999, tigris_use_cache = TRUE, tigris_class = 'sf')

# Pennsylvania State Plane South (US ft)
crs <- 'epsg:2272'
  1. Predict development pressure: how do we define “a lot of development”?

  2. Define affordability burden: how do we define “affordability burden”? – % change year over year in the population that is experiencing rent burden (will probably see extreme tipping points), growing population, % change in area incomes

  3. Identify problem zoning

  4. Calculate number of connected parcels

  5. Predict development pressure at the block level

  6. Identify not burdened areas

  7. Identify problem zoning

  8. Calculate number of connected parcels

  9. Advocate for upzoning in parcels where there is local development pressure, no affordability burden, problem zoning, and high number of connected parcels

  • transit
  • zoning (OSM)
  • land use (OSM)
Show the code
# Data loading & light cleaning ----
# Named vector of sources; each is read with phl_spat_read() and assigned
# to a global variable named after its list element.
urls <- c(
  phl = "https://opendata.arcgis.com/datasets/405ec3da942d4e20869d4e1449a2be48_0.geojson",
  phl_bgs = "https://opendata.arcgis.com/datasets/2f982bada233478ea0100528227febce_0.geojson",
  nbhoods = "https://raw.githubusercontent.com/opendataphilly/open-geo-data/master/Neighborhoods_Philadelphia/Neighborhoods_Philadelphia.geojson",
  roads = 'https://opendata.arcgis.com/datasets/261eeb49dfd44ccb8a4b6a0af830fdc8_0.geojson',
  historic_districts = "https://phl.carto.com/api/v2/sql?q=SELECT+*+FROM+historicdistricts_local&filename=historicdistricts_local&format=geojson&skipfields=cartodb_id",
  zoning = "https://opendata.arcgis.com/datasets/0bdb0b5f13774c03abf8dc2f1aa01693_0.geojson",
  overlays = "https://opendata.arcgis.com/datasets/04fd29a8c022471994900cb0fd791bfc_0.geojson",
  opa = "data/opa_properties.geojson",
  building_permits = building_permits_path,
  acs14 = acs_vars14_path,
  acs19 = acs_vars19_path,
  trolley_stops = "data/Trolley_Stations.geojson",
  subway_stops = "data/Highspeed_Stations.geojson"
)

suppressMessages({
  invisible(
    imap(urls, ~ assign(.y, phl_spat_read(.x), envir = .GlobalEnv))
  )
})

# Block groups: keep only the identifier
phl_bgs <- phl_bgs %>%
              select(GEOID10)

# Broad & Market Street centerlines, drawn on maps for orientation
broad_and_market <- roads %>% filter(ST_NAME %in% c('BROAD',"MARKET") | SEG_ID %in% c(440370, 421347,421338,421337,422413,423051,440403,440402,440391,440380))

# Clip subway stops to the city boundary
subway_stops <- subway_stops[phl, ]

# Pool trolley and subway stops into one stop layer.
# BUG FIX: the original used st_union(trolley_stops, subway_stops), which
# returns the pairwise (cartesian) combination of the two feature sets --
# n_trolley x n_subway rows of merged geometries -- not the pooled stops.
# bind_rows keeps one row per stop and yields the same nearest-stop
# distances downstream.
transit_stops <- bind_rows(trolley_stops, subway_stops)

# Historic districts: keep only the district name, renamed for the join
historic_districts <- historic_districts %>%
                        mutate(hist_dist = name) %>%
                        select(hist_dist)

# Neighborhoods: keep only the display name
nbhoods <- nbhoods %>%
            select(mapname)

# Zoning overlays: snake_case names, strip slashes from symbols, and
# collapse the literal "[NA]" symbol into an "Other" bucket
overlays <- overlays %>% clean_names()
overlays$overlay_symbol <- gsub("/", "", overlays$overlay_symbol)
overlays <- overlays %>%
              mutate(overlay_symbol = ifelse(overlay_symbol == "[NA]", "Other", overlay_symbol))

# Restrict permits to residential construction types
building_permits <- building_permits %>%
                      filter(permittype %in% c("RESIDENTIAL BUILDING", "BP_ADDITION", "BP_NEWCNST"))

# ACS variables used in the regressions
acs_reg_vars <- c(
  "GEOID",
  "Total_Pop",
  "Med_Inc",
  "Percent_Nonwhite",
  "Percent_Renters",
  "Rent_Burden",
  "Ext_Rent_Burden"
  )

# 2014 5-year ACS: non-spatial, snake_case, keyed on GEOID10
acs14_reg_vars <- phl_spat_read(acs_vars14_path) %>%
                    st_drop_geometry() %>%
                    select(all_of(acs_reg_vars)) %>%
                    clean_names() %>%
                    rename(GEOID10 = geoid)

# 2019 5-year ACS: same treatment
acs19_reg_vars <- phl_spat_read(acs_vars19_path) %>%
                    st_drop_geometry() %>%
                    select(all_of(acs_reg_vars)) %>%
                    clean_names() %>%
                    rename(GEOID10 = geoid)

# 2014 -> 2019 change metrics.
# BUG FIX: the original wrote `med_inc_19 - med_inc_14 / med_inc_14`, which
# R parses as `med_inc_19 - (med_inc_14 / med_inc_14)` = med_inc_19 - 1
# (and likewise for population). Percent change requires parenthesizing
# the difference before dividing by the base-year value.
acs_tot <- acs14_reg_vars %>%
             left_join(acs19_reg_vars, by = "GEOID10", suffix = c("_14", "_19")) %>%
             mutate(pct_med_inc_change = (med_inc_19 - med_inc_14) / med_inc_14,
                    pct_tot_pop_change = (total_pop_19 - total_pop_14) / total_pop_14,
                    pct_renters_change = percent_renters_19 - percent_renters_14,
                    pct_rent_burdenchange = rent_burden_19 - rent_burden_14)
Show the code
# Quick sanity-check map of the block-group geometries
phl_bgs %>%
  ggplot() +
  geom_sf() +
  theme_void()

Show the code
# Re-read block groups fresh (same source as above) and keep only the id.
phl_bgs <- phl_spat_read("https://opendata.arcgis.com/datasets/2f982bada233478ea0100528227febce_0.geojson") %>%
              select(GEOID10)

# Create a complete grid of GEOID10 and year so that block-group/year
# combinations with zero permits still appear as rows.
geoid_years <- expand.grid(GEOID10 = unique(phl_bgs$GEOID10),
                           year = unique(building_permits$year))



# Joining your existing data with the complete grid:
# spatially join permits to block groups, total counts per GEOID10/year,
# then right-join against the full grid so missing combinations become
# explicit rows with permits_count = 0, and re-attach geometry.
# NOTE(review): assumes building_permits carries a `permits_count` column
# and a `year` column -- neither is created in the visible code; confirm
# upstream (likely in building_permits_path preprocessing).
permits_bg <- st_join(phl_bgs, building_permits) %>%
              group_by(GEOID10, year) %>%
              summarize(permits_count = sum(permits_count, na.rm = TRUE)) %>%
              st_drop_geometry() %>%
              right_join(geoid_years, by = c("GEOID10", "year")) %>%
              replace_na(list(permits_count = 0)) %>%
              left_join(phl_bgs, by = "GEOID10") %>%
              st_as_sf() 


### spat + temp lags----------------------------------
# Spatial lag: mean permit count among the 5 nearest block groups,
# computed within each year. Temporal lags: each block group's own counts
# from 1-9 years earlier.
# BUG FIX: the original computed the temporal lags on the whole arranged
# frame without grouping by GEOID10, so the first rows of every block
# group picked up the *previous block group's* final years. Grouping by
# GEOID10 keeps lags within each block group (leading rows become NA).
suppressMessages(
permits_bg <- permits_bg %>%
              group_by(year) %>%
              mutate(nb = st_knn(geometry, 5),
                         wt = st_weights(nb),
                      lag_spatial = st_lag(permits_count, nb, wt)) %>% # calculate spat lag
              ungroup() %>%
              arrange(GEOID10, year) %>% # calculate time lag
              group_by(GEOID10) %>%
               mutate(
                 lag_1_year = lag(permits_count, 1),
                 lag_2_years = lag(permits_count, 2),
                 lag_3_years = lag(permits_count, 3),
                 lag_4_years = lag(permits_count, 4),
                 lag_5_years = lag(permits_count, 5),
                 lag_6_years = lag(permits_count, 6),
                 lag_7_years = lag(permits_count, 7),
                 lag_8_years = lag(permits_count, 8),
                 lag_9_years = lag(permits_count, 9),
                 # years until the 2022 tax-abatement deadline
                 dist_to_2022 = 2022 - year
               ) %>%
              ungroup()
)

### distance to transit---------------------------------
# Distance (ft, EPSG:2272) from each block group's interior point to the
# nearest transit stop; rowwise() so min() is taken per block group.
phl_bgs <- phl_bgs %>%
  rowwise() %>%
  mutate(
    dist_to_transit = as.numeric(min(st_distance(st_point_on_surface(geometry), transit_stops$geometry)))
  ) %>%
  ungroup()

### historic dists---------------------------------
# One-hot indicator column per historic district per block group.
# (The original first built an unused hist_dist 0/1 frame and immediately
# overwrote it with this one; that dead assignment is removed.)
hist_dists_x_bg <- st_join(phl_bgs, historic_districts) %>%
                      st_drop_geometry() %>%
                      mutate(hist_dist_present = 1) %>%
                      group_by(GEOID10, hist_dist) %>%
                      summarize(hist_dist_present = max(hist_dist_present, na.rm = TRUE), .groups = 'drop') %>%
                      pivot_wider(names_from = "hist_dist", values_from = "hist_dist_present", 
                                  names_prefix = "hist_dist_", values_fill = list("hist_dist_present" = 0))

phl_bgs <- left_join(phl_bgs, hist_dists_x_bg, by = "GEOID10")

### hoods---------------------------------              
# Assign each block group the neighborhood with which it shares the
# largest intersection area.
intersection <- st_intersection(phl_bgs, nbhoods)
intersection$intersection_area <- st_area(intersection)
max_intersection <- intersection %>%
  group_by(GEOID10) %>%
  filter(intersection_area == max(intersection_area)) %>%
  ungroup() %>%
  select(GEOID10, mapname) %>%
  st_drop_geometry()

# BUG FIX: the original joined phl_bgs against a geometry-dropped copy of
# itself plus mapname (`bgs_w_hood`), which duplicated every phl_bgs column
# with .x/.y suffixes. Joining max_intersection directly adds only mapname.
phl_bgs <- left_join(phl_bgs, max_intersection, by = "GEOID10")

### overlays---------------------------------
# One-hot indicator column per zoning overlay per block group.
overlays_x_bg <- st_join(phl_bgs, overlays %>% select(overlay_symbol)) %>%
                      st_drop_geometry() %>%
                      mutate(overlay_present = 1) %>%
                      group_by(GEOID10, overlay_symbol) %>%
                      summarize(overlay_present = max(overlay_present, na.rm = TRUE), .groups = 'drop') %>%
                      pivot_wider(names_from = "overlay_symbol", values_from = "overlay_present", 
                                  names_prefix = "overlay_", values_fill = list("overlay_present" = 0))

phl_bgs <- left_join(phl_bgs, overlays_x_bg, by = "GEOID10")


### join back together----------------------
# Attach all block-group features to the permit panel.
permits_bg <- left_join(permits_bg,
                     st_drop_geometry(phl_bgs),
                     by = "GEOID10")

### acs vars--------------------------------------------

# Attach the nearest ACS vintage: 2014 variables for pre-2019 permit
# years, 2019 variables for 2019 and later, then recombine.
permits_pre_2019 <- permits_bg %>% filter(year < 2019)
permits_post_2018 <- permits_bg %>% filter(year >= 2019)


permits_joined_pre_2019 <- permits_pre_2019 %>% left_join(acs14_reg_vars, by = "GEOID10")
permits_joined_post_2018 <- permits_post_2018 %>% left_join(acs19_reg_vars, by = "GEOID10")


permits_bg <- bind_rows(permits_joined_pre_2019, permits_joined_post_2018)

### clean-----------------------------------------------------
# Drop the neighbor-list helper columns and the raw id; standardize names.
permits_bg <- permits_bg %>%
  select(-c(nb, wt, GEOID10)) %>%
  clean_names()
Show the code
# Animated year-by-year choropleth of permit counts (2012 excluded).
tm <- permits_bg %>%
  filter(year != 2012) %>%
  tm_shape() +
  tm_polygons(col = "permits_count", border.alpha = 0, palette = 'viridis',
              style = "fisher", colorNA = "lightgrey") +
  tm_facets(along = "year") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

suppressMessages(
  tmap_animation(tm, "assets/permits_animation.gif", delay = 50)
)

Philadelphia Building Permits, 2013 - 2022

3 Methods

3.1 Data

Show the code
# Citywide residential permit volume by year
building_permits %>%
  ggplot(aes(x = as.factor(year))) +
  geom_bar() +
  labs(title = "Permits per Year")

Show the code
# Distribution of permits per block group per year (log-scaled x axis)
permits_bg %>%
  st_drop_geometry() %>%
  ggplot(aes(x = permits_count)) +
  geom_histogram() +
  scale_x_log10() +
  facet_wrap(~year) +
  labs(title = "Permits per Block Group per Year",
       subtitle = "Log-Transformed")

Show the code
# 
# tm_shape(permits_bg) +
#   tm_polygons("spat_lag_permits", border.alpha = 0, palette = 'viridis', style = "fisher", colorNA = "lightgrey") +
#   tm_facets(along = "year") +
# tm_shape(broad_and_market) +
#   tm_lines(col = "lightgrey") +
#   tm_layout(frame = FALSE)

3.1.1 Feature Engineering

Show the code
# Reshape the engineered features (lags + years-to-2022) to long form and
# plot each against the outcome with a fitted line and Pearson r.
permits_bg_long <- permits_bg %>%
  st_drop_geometry() %>%
  pivot_longer(
    cols = c(starts_with("lag"), dist_to_2022),
    names_to = "Variable",
    values_to = "Value"
  )


ggscatter(
  permits_bg_long,
  x = "permits_count",
  y = "Value",
  facet.by = "Variable",
  add = "reg.line",
  add.params = list(color = "blue", fill = "lightgray"),
  conf.int = TRUE
) +
  stat_cor(method = "pearson", p.accuracy = 0.001, r.accuracy = 0.01)

3.2 Models

4 Results

4.1 OLS Regression

To begin, we run a simple regression incorporating three engineered groups of features: space lag, time lag, and distance to 2022. We include this last variable because of a Philadelphia tax abatement policy that led to a significant increase in residential development in the years immediately before 2022. We will use this as a baseline model to compare to our more complex models.

Show the code
# Baseline OLS ----
# Train on all years before 2022; hold out 2022 as the test set.
# (mapname is dropped so the neighborhood factor doesn't enter the model.)
permits_train <- filter(permits_bg %>% select(-mapname), year < 2022)
permits_test <- filter(permits_bg %>% select(-mapname), year == 2022)

reg <- lm(permits_count ~ ., data = st_drop_geometry(permits_train))

predictions <- predict(reg, permits_test)
predictions <- cbind(permits_test, predictions)

# CONSISTENCY/BUG FIX: the original divided by permits_count directly,
# producing Inf for block groups with zero observed permits; the random
# forest section guards the denominator with +0.0001, so do the same here.
predictions <- predictions %>%
                  mutate(abs_error = abs(permits_count - predictions),
                         pct_error = abs_error / (permits_count + 0.0001))

# Predicted vs. actual for the held-out year
ggplot(predictions, aes(x = permits_count, y = predictions)) +
  geom_point() +
  labs(title = "Predicted vs. Actual Permits",
       subtitle = "2022") +
  geom_smooth(method = "lm", se = FALSE)

Show the code
# Mean absolute error of the OLS model, formatted for inline display
mae <- paste0("MAE: ", round(mean(predictions$abs_error, na.rm = TRUE), 2))

# Spatial distribution of OLS absolute error
predictions %>%
  tm_shape() +
  tm_polygons(col = "abs_error", border.alpha = 0, palette = 'viridis',
              style = "fisher", colorNA = "lightgrey") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

4.2 Poisson Model

Given that we are dealing with a skewed distribution of count data

Show the code
# Poisson regression for the count outcome (log link); predictions on the
# response scale so they are comparable to observed counts.
poisson_reg <- glm(permits_count ~ ., 
                   family = poisson(link = "log"),
                   data = st_drop_geometry(permits_train))

poisson_predictions <- predict(poisson_reg, permits_test, type = "response")
poisson_predictions <- cbind(permits_test, poisson_predictions)

# CONSISTENCY/BUG FIX: guard the denominator as in the random forest
# section; zero-permit block groups otherwise yield Inf percent error.
poisson_predictions <- poisson_predictions %>%
                           mutate(abs_error = abs(permits_count - poisson_predictions),
                                  pct_error = abs_error / (permits_count + 0.0001))

# Predicted vs. actual for the held-out year
ggplot(poisson_predictions, aes(x = permits_count, y = poisson_predictions)) +
  geom_point() +
  labs(title = "Predicted vs. Actual Permits",
       subtitle = "2022") +
  geom_smooth(method = "lm", se = FALSE)

Show the code
poisson_mae <- paste0("MAE: ", round(mean(poisson_predictions$abs_error, na.rm = TRUE), 2))

tm_shape(poisson_predictions) +
        tm_polygons(col = "abs_error", border.alpha = 0, palette = 'viridis', style = "fisher", colorNA = "lightgrey") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

We find that our OLS model has an MAE of only 2.44–not bad for such a simple model! Still, it struggles most in the areas where we most need it to succeed, so we will try to introduce better variables and apply a more complex model to improve our predictions.

4.3 Random Forest Regression

Show the code
# Random forest with default hyperparameters; rows with NA lag features
# are dropped via na.omit.
rf <- randomForest(
  permits_count ~ .,
  data = st_drop_geometry(permits_train),
  importance = TRUE,
  na.action = na.omit
)

rf_predictions <- predict(rf, permits_test)
rf_predictions <- cbind(permits_test, rf_predictions)
# Small epsilon keeps percent error finite where observed permits are zero
rf_predictions <- rf_predictions %>%
  mutate(
    abs_error = abs(permits_count - rf_predictions),
    pct_error = abs_error / (permits_count + 0.0001)
  )

# Predicted vs. actual for the held-out year
ggplot(rf_predictions, aes(x = permits_count, y = rf_predictions)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Predicted vs. Actual Permits",
       subtitle = "2022")

Show the code
# Mean absolute error of the random forest, formatted for inline display
rf_mae <- paste0("MAE: ", round(mean(rf_predictions$abs_error, na.rm = TRUE), 2))

# Spatial distribution of random-forest absolute error
rf_predictions %>%
  tm_shape() +
  tm_polygons(col = "abs_error", border.alpha = 0, palette = 'viridis',
              style = "fisher", colorNA = "lightgrey") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

5 Discussion

5.1 Accuracy

Predominantly, our model overpredicts, which is better than underpredicting, as it facilitates new development.

Show the code
# Histogram of absolute errors; the vertical line marks the MAE.
# Square-root rule for the bin count.
nbins <- as.integer(sqrt(nrow(rf_predictions)))
vline <- mean(rf_predictions$abs_error, na.rm = TRUE)

rf_predictions %>%
  ggplot(aes(x = abs_error)) +
  geom_histogram(bins = nbins) +
  geom_vline(aes(xintercept = vline))

5.2 Generalizability

Show the code
# Citywide map of percent non-white population (2019 ACS), for comparison
# against the spatial pattern of model errors
acs19 %>%
  tm_shape() +
  tm_polygons(col = "Percent_Nonwhite", border.alpha = 0, palette = 'viridis',
              style = "fisher", colorNA = "lightgrey") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

Show the code
# Label block groups by racial composition for an equity check on errors.
# `missing =` keeps NA percent_nonwhite in the "Majority White" bucket,
# matching the original case_when TRUE fallthrough.
rf_predictions <- rf_predictions %>%
  mutate(
    race_comp = if_else(percent_nonwhite >= .50,
                        "Majority Non-White",
                        "Majority White",
                        missing = "Majority White")
  )

# Compare error distributions across the two groups
rf_predictions %>%
  ggplot(aes(y = abs_error, color = race_comp)) +
  geom_boxplot(fill = NA)

How does this generalize across neighborhoods?

How does this generalize across council districts?

5.3 Assessing Upzoning Needs

We can identify conflict between projected development and current zoning.

Look at zoning that is industrial or residential single family in areas that our model suggests are high development risk for 2023:

Show the code
# Parcels zoned single-family residential (RS*) or industrial (I*).
# BUG FIX: the original used str_detect(CODE, "I"), an unanchored
# substring match that would also catch any non-industrial code merely
# containing the letter "I". Anchoring both patterns to the start of the
# code restricts the filter to the intended zoning families.
filtered_zoning <- zoning %>%
                     filter(str_detect(CODE, "^RS") | str_detect(CODE, "^I"))


# Map of the poorly-zoned parcel universe, colored by zoning code
tm_shape(filtered_zoning) +
        tm_polygons(col = "CODE", border.alpha = 0, colorNA = "lightgrey") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

We can extract development predictions at the block level to these parcels and then visualize them by highest need.

Show the code
# Attach the block-group-level RF predictions to each parcel by spatial
# join (sf's select keeps geometry, so only rf_predictions + geometry
# come across).
# NOTE(review): a parcel straddling multiple block groups is duplicated by
# st_join -- fine for mapping, but confirm before any per-parcel counts.
filtered_zoning <- st_join(
  filtered_zoning,
  rf_predictions %>% select(rf_predictions)
)

# Parcel-level map of predicted 2022 development pressure
tm_shape(filtered_zoning) +
        tm_polygons(col = "rf_predictions", border.alpha = 0, colorNA = "lightgrey", palette = "viridis", style = "fisher") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

Show the code
# Interactive map of poorly-zoned parcels sitting in high-pressure block
# groups (more than 10 predicted permits).
tmap_mode('view')

high_pressure_parcels <- filtered_zoning %>%
  filter(rf_predictions > 10)

tm_shape(high_pressure_parcels) +
  tm_polygons(col = "CODE", border.alpha = 0, colorNA = "lightgrey",
              popup.vars = c('rf_predictions', 'CODE')) +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

Furthermore, we can identify properties with high potential for assemblage, which suggests the ability to accommodate high-density, multi-unit housing.

Show the code
# Assemblage potential ----
# For each parcel, count the size of its first-order contiguity
# neighborhood (the parcel plus directly contiguous parcels) via igraph.
nbs <- filtered_zoning %>% 
  mutate(nb = st_contiguity(geometry))

# Edge list; st_contiguity encodes "no neighbors" as 0, which we drop
edge_list <- tibble::tibble(id = seq_along(nbs$nb), nbs = nbs$nb) %>% 
  tidyr::unnest(nbs) %>% 
  filter(nbs != 0)

# One graph node per parcel, named by row index
g <- make_empty_graph(n = nrow(filtered_zoning))
V(g)$name <- as.character(seq_len(nrow(filtered_zoning)))

# Add edges if any contiguities exist
if (nrow(edge_list) > 0) {
  g <- add_edges(g, c(t(as.matrix(edge_list))))
}

# Order-1 neighborhood size per node. neighborhood() includes the node
# itself, so isolated parcels naturally count as 1.
# BUG FIX: the original branched on `node %in% edges`, but `edges` was
# only defined inside the if-block above, so the code errored whenever no
# edges existed -- and the branch was unnecessary anyway, since
# neighborhood() already returns just the node for isolates.
n_contiguous <- sapply(V(g)$name, function(node) {
  length(neighborhood(g, order = 1, nodes = as.numeric(node))[[1]])
})

filtered_zoning <- filtered_zoning %>%
                    mutate(n_contig = n_contiguous)

# Table of parcels with both high predicted pressure (> 10 permits) and
# assemblage potential (> 2 parcels in the contiguity neighborhood)
filtered_zoning %>%
  st_drop_geometry() %>%
  select(rf_predictions,
         n_contig,
         OBJECTID,
         CODE) %>%
  filter(rf_predictions > 10,
         n_contig > 2) %>%
  kablerize(caption = "Poorly-Zoned Properties with High Development Risk")
Poorly-Zoned Properties with High Development Risk
rf_predictions n_contig OBJECTID CODE
275 12.07037 3 604 I2
362 10.39053 6 739 ICMX
858 21.13753 6 1524 ICMX
912 10.39053 8 1611 ICMX
916 21.13753 8 1615 ICMX
1522 10.86167 4 2559 RSA5
1631 21.13753 3 2736 IRMX
1643 21.13753 6 2756 ICMX
1670 21.13753 7 2803 ICMX
1671 21.13753 3 2804 IRMX
1865 35.22817 3 3128 IRMX
2251 11.71790 4 3744 IRMX
3168 10.37503 3 5506 RSA5
3267 10.39053 6 5690 I2
3439 21.13753 6 6067 ICMX
3598 10.39053 7 6362 RSA5
3616 21.13753 3 6405 RSA5
3846 35.22817 3 6901 ICMX
3868 10.39053 3 6953 ICMX
3973 10.39053 7 7195 RSA5
4155 27.93903 3 7646 ICMX
4178 11.19160 3 7704 IRMX
4697 23.73770 3 9094 RSA5
4752 14.52963 5 9244 IRMX
4800 10.37503 3 9371 ICMX
4910 21.13753 3 9662 RSA5
5211 39.34947 3 10411 ICMX
5212 39.34947 3 10412 RSA5
5213 39.34947 3 10413 ICMX
5345 17.71353 3 10760 IRMX
5501 41.10897 3 11150 ICMX
5506 39.34947 3 11161 RSA5
5616 10.86167 3 11449 RSA3
5996 11.36453 4 12339 I2
6206 11.36453 3 12808 RSA5
6258 11.36453 3 12932 ICMX
6312 14.52963 6 13058 ICMX
6412 23.86250 3 13314 IRMX
6703 21.76530 3 13979 I3
6705 11.36453 3 13981 RSA3
6720 12.96780 3 14019 RSA5
6793.2 11.36453 8 14223 I2
6794 21.76530 3 14224 ICMX
6808 21.76530 3 14257 I2
6855 11.36453 3 14373 RSA5
6864 11.36453 3 14401 RSA5
6865 11.36453 3 14402 RSA5
6965 12.96780 3 14649 ICMX
7192 10.86167 3 15167 RSA2
7446 41.10897 3 15720 I2
7487 11.36453 6 15820 RSA5
7637 12.96780 3 16181 RSA5
7885 41.10897 3 16719 RSA5
8105 15.86130 3 17170 ICMX
8132 12.96780 3 17217 ICMX
8220 15.77780 3 17410 RSA5
8543 15.61970 3 18033 RSD3
9075 15.61970 3 19078 RSA3
9189 10.39053 4 19279 RSA5
9362 10.86167 3 19593 RSA3
9607 21.13753 4 20075 ICMX
9802 12.07037 3 20444 RSA5
9854 15.61970 4 20536 RSA2
10377 12.96780 3 21529 ICMX
10651 15.61970 3 22004 RSD1
12877 27.93903 4 25778 RSA5
12929 10.86167 4 25868 RSA1
13164 15.61970 3 26249 RSD1
13494 11.11740 3 26759 RSA5
13599 10.86167 3 26935 RSA3
13817 10.86167 3 27284 RSA2
13892 10.86167 3 27394 RSD3
14110 12.96780 4 27776 I2
14158 18.05380 3 27871 IRMX
14719 10.39053 28 28949 I2
14719.2 21.13753 28 28949 I2
Show the code
tmap_mode('view')

filtered_zoning %>%
  select(rf_predictions,
         n_contig,
         OBJECTID,
         CODE) %>%
  filter(rf_predictions > 10,
         n_contig > 2) %>%
tm_shape() +
        tm_polygons(col = "rf_predictions", border.alpha = 0, colorNA = "lightgrey", palette = "viridis", style = "fisher",
                    popup.vars = c('rf_predictions', 'n_contig', 'CODE'), alpha = 0.5) +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

6 Conclusion

7 Appendices